notes

  • From now on I will focus on using seaborn; it should be more than enough for simple EDA purposes.

In [1]:
import pandas as pd
import seaborn as sns

%matplotlib inline

prepare data


In [2]:
# The data file has no header row, so the column names are supplied here.
df = pd.read_csv(
    'ex2data1.txt',
    names=['exam1', 'exam2', 'admitted'],
)
# Print (rows, columns), then show the first five rows as the cell output.
print(df.shape)
df.head()


(100, 3)
Out[2]:
exam1 exam2 admitted
0 34.623660 78.024693 0
1 30.286711 43.894998 0
2 35.847409 72.902198 0
3 60.182599 86.308552 1
4 79.032736 75.344376 1

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()


Out[3]:
exam1 exam2 admitted
count 100.000000 100.000000 100.000000
mean 65.644274 66.221998 0.600000
std 19.458222 18.582783 0.492366
min 30.058822 30.603263 0.000000
25% 50.919511 48.179205 0.000000
50% 67.032988 67.682381 1.000000
75% 80.212529 79.360605 1.000000
max 99.827858 98.869436 1.000000

seaborn


In [7]:
# Global seaborn theme: notebook-sized fonts, dark grid background, and a
# two-colour red/blue palette (one colour per value of `admitted`).
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))

# Scatter plot of the two exam scores, coloured by admission outcome.
# x/y are passed by keyword because newer seaborn releases make them
# keyword-only, and `height` is the current name of the old `size` parameter
# (renamed in seaborn 0.9).
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=df,
           height=6,               # facet height in inches
           fit_reg=False,          # scatter only, no fitted regression line
           scatter_kws={"s": 50}   # marker size
          )


Out[7]:
<seaborn.axisgrid.FacetGrid at 0x115491da0>

In [ ]: